# Environment setup: imports, plotting defaults, Spark session and data load.
import re

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import (
    col,
    explode,
    monotonically_increasing_id,
    regexp_replace,
    split,
    transform,
    when,
)

# Seed NumPy's global random generator so runs are repeatable.
np.random.seed(42)

# Render Plotly figures with the "notebook" renderer.
pio.renderers.default = "notebook"

# Initialize Spark Session
spark = SparkSession.builder.appName("./data/LightcastData").getOrCreate()

# Load Data
# The file has a header row; column types are inferred, records may span
# multiple lines, and a double quote is used as the escape character.
df = (
    spark.read.option("header", "true")
    .option("inferSchema", "true")
    .option("multiLine", "true")
    .option("escape", "\"")
    .csv("./data/lightcast_job_postings.csv")
)

# Show Schema and Sample Data
# Diagnostic checks only; keep them commented out when rendering the submission.
# print("---This is Diagnostic check, No need to print it in the final doc---")
# df.printSchema()
# df.show(5)
WARNING: Using incubator modules: jdk.incubator.vector
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/09/23 23:53:53 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
[Stage 1:> (0 + 1) / 1]
5 Salary Analysis by ONET Occupation Type (Bubble Chart)
# Expose the DataFrame to Spark SQL under the view name "Job_Postings"
# (replacing any existing temp view with that name).
df.createOrReplaceTempView(name="Job_Postings")
25/09/23 23:55:10 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
from pathlib import Path

import plotly.express as px

# Ten most frequently posted titles, each with its median salary.
# NOTE(review): the query groups by TITLE_NAME and exposes it under the alias
# ONET_NAME, while the chart describes it as the ONET occupation -- confirm
# that TITLE_NAME is the intended grouping column.
salary_analysis = spark.sql("""
    SELECT
        TITLE_NAME AS ONET_NAME,
        PERCENTILE(SALARY, 0.5) AS Median_Salary,
        COUNT(*) AS Job_Postings
    FROM Job_Postings
    GROUP BY TITLE_NAME
    ORDER BY Job_Postings DESC
    LIMIT 10
""")

# Only ten aggregated rows remain, so collecting them to pandas is cheap.
salary_pd = salary_analysis.toPandas()

# Bubble chart: one bubble per title; both bubble size and colour encode the
# number of postings.
fig = px.scatter(
    salary_pd,
    x="ONET_NAME",
    y="Median_Salary",
    size="Job_Postings",
    title="Median Salary by ONET Occupation Type (Bubble Chart)",
    labels={
        "ONET_NAME": "ONET Occupation",
        "Median_Salary": "Median Salary",
        "Job_Postings": "Number of Job Postings",
    },
    hover_name="ONET_NAME",
    size_max=60,
    width=1000,
    height=600,
    color="Job_Postings",
    color_continuous_scale="Plasma",
)

fig.update_layout(
    font_family="Arial",
    font_size=14,
    title_font_size=25,
    xaxis_title="ONET Occupation",
    yaxis_title="Median Salary",
    plot_bgcolor="white",
    xaxis=dict(
        tickangle=-45,  # slant the long category labels so they do not overlap
        showline=True,
        linecolor="black",
    ),
    yaxis=dict(
        showline=True,
        linecolor="black",
    ),
)

fig.show()

# Make sure the export directory exists; the writers below do not create it.
Path("output").mkdir(parents=True, exist_ok=True)
fig.write_html("output/Q3.html")
fig.write_image("output/Q3.svg", width=1100, height=900, scale=1)